CSE314 Group Project
  • Project Overview
  • Data Overview
  • Data Preprocess
  • Data Visualization
  • Demo Feature Selection
  • Database
  • Model - Logistics
    • EDA
    • Feature Engineering
    • Result
  • Model - Random Forest
  • Model Examination - Dashboard
  • Kaggle Competition
CSE314 Group Project
  • »
  • Model - Logistics

Model - Logistics¶

In [2]:
Copied!
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport import pandas as pd import numpy as np
c:\Users\ericl\miniconda3\lib\site-packages\tqdm\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

EDA¶

In [55]:
Copied!
# Load train.csv (resolved relative to the working directory) into df;
# the cells below use df as the training data.
df = pd.read_csv('train.csv')
df = pd.read_csv('train.csv')
In [11]:
Copied!
# Preview the first rows to check which columns were loaded.
df.head()
df.head()
Out[11]:
raw_row_number location county_name subject_age subject_race subject_sex officer_id_hash department_name type arrest_made ... outcome frisk_performed search_conducted search_person search_vehicle reason_for_stop raw_Ethnicity raw_Race raw_action_description date_time
0 12511107 NaN forsyth county 18.0 white male f2f6b08c97 Winston-Salem Police Department vehicular False ... citation False False False False Speed Limit Violation N W Citation Issued 2010-12-05 01:51:24
1 5439683 raleigh wake county 25.0 hispanic male 1e3fa73f20 Raleigh Police Department vehicular False ... warning False True True True Vehicle Regulatory Violation H W Verbal Warning 2005-09-25 03:40:00
2 18674698 charlotte area mecklenburg county 30.0 black female 59a754eb04 Charlotte-Mecklenburg Police Department vehicular False ... warning False False False False Speed Limit Violation N B Verbal Warning 2014-11-15 02:00:00
3 12600300 charlotte area mecklenburg county 21.0 white male 0dc507ea69 Charlotte-Mecklenburg Police Department vehicular False ... warning False False False False Vehicle Regulatory Violation N W Verbal Warning 2011-01-23 00:16:00
4 6035053 NaN durham county 38.0 black female 91822b2dfe Durham Police Department vehicular False ... citation False False False False Speed Limit Violation N B Citation Issued 2006-06-18 10:17:17

5 rows × 22 columns

Feature Engineering¶

In [7]:
Copied!
# Build a pandas-profiling report over df. Leaving `profile` as the last
# expression of the cell makes the notebook render it as the cell output.
profile = ProfileReport(df, title="Police Stop Profiling Report")
profile
profile = ProfileReport(df, title="Police Stop Profiling Report") profile
Summarize dataset: 100%|██████████| 41/41 [06:19<00:00,  9.25s/it, Completed]                               
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.29s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]
Out[7]:

In [ ]:
Copied!

In [56]:
Copied!
# Encode the target as integers: warning -> 0, citation -> 1, arrest -> 2.
# The result is assigned back instead of calling replace(inplace=True) on the
# selected column: with pandas Copy-on-Write the selection is a temporary
# object, so an in-place replace on it would leave df unchanged.
df["outcome"] = df["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2])

# Get rid of unnecessary columns.
# NOTE(review): arrest_made, citation_issued and warning_issued presumably
# restate the outcome and would leak the target if kept - confirm.
columns_to_drop = [
    'date_time', 'department_name', 'county_name', 'type', 'raw_Race',
    'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location',
    'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued',
    'warning_issued',
]
df.drop(columns=columns_to_drop, inplace=True)
#Getting rid of unneccessary columns df["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True) df.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True)
In [57]:
Copied!
# Change reason_for_stop and the boolean columns to numerical values.

# One indicator (0/1) column per distinct reason_for_stop value, appended to
# df; the original text column is then removed.
df = pd.concat([df, df['reason_for_stop'].str.get_dummies()], axis=1)
df.drop(columns=['reason_for_stop'], inplace=True)

# Each replace() result is assigned back to its column. Calling
# replace(inplace=True) on df[col] acts on a temporary selection and, with
# pandas Copy-on-Write, would not update df.
df['subject_sex'] = df['subject_sex'].replace({'male': 1, 'female': 0})
for flag in ['frisk_performed', 'search_conducted', 'search_person', 'search_vehicle']:
    df[flag] = df[flag].replace({True: 1, False: 0})
#Change reason for stop and boolean columns to numerical df = pd.concat([df, df['reason_for_stop'].str.get_dummies()], axis=1) df.drop(columns=['reason_for_stop'], inplace=True) df['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True) df['frisk_performed'].replace({True: 1, False: 0}, inplace=True) df['search_conducted'].replace({True: 1, False: 0}, inplace=True) df['search_person'].replace({True: 1, False: 0}, inplace=True) df['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
In [76]:
Copied!
# Profile df again; as the last expression of the cell, the report is
# rendered as the cell output.
ProfileReport(df, title='Useful Profile Report')
ProfileReport(df, title='Useful Profile Report')
Summarize dataset: 100%|██████████| 32/32 [09:45<00:00, 18.29s/it, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.28s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Out[76]:

In [66]:
Copied!
# Read the test file and perform the same changes that were applied to df.
test = pd.read_csv('test.csv')

# Encode the target: warning -> 0, citation -> 1, arrest -> 2. Results are
# assigned back rather than using replace(inplace=True) on a column
# selection, which would not update `test` under pandas Copy-on-Write.
test["outcome"] = test["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2])

test.drop(
    columns=[
        'date_time', 'department_name', 'county_name', 'type', 'raw_Race',
        'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location',
        'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued',
        'warning_issued',
    ],
    inplace=True,
)

# One indicator column per distinct reason_for_stop value.
# NOTE(review): these columns are derived from test.csv alone, so they only
# line up with the training columns if both files contain the same set of
# reasons - confirm.
test = pd.concat([test, test['reason_for_stop'].str.get_dummies()], axis=1)
test.drop(columns=['reason_for_stop'], inplace=True)

test['subject_sex'] = test['subject_sex'].replace({'male': 1, 'female': 0})
for flag in ['frisk_performed', 'search_conducted', 'search_person', 'search_vehicle']:
    test[flag] = test[flag].replace({True: 1, False: 0})
#read test files and also perform the same changes test = pd.read_csv('test.csv') test["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True) test.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True) test = pd.concat([test, test['reason_for_stop'].str.get_dummies()], axis=1) test.drop(columns=['reason_for_stop'], inplace=True) test['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True) test['frisk_performed'].replace({True: 1, False: 0}, inplace=True) test['search_conducted'].replace({True: 1, False: 0}, inplace=True) test['search_person'].replace({True: 1, False: 0}, inplace=True) test['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
In [71]:
Copied!
# Separate data for modeling: 'outcome' is the label, every other column
# is a feature.
Y_train, Y_test = df['outcome'], test['outcome']
X_train = df.drop(columns=['outcome'])
X_test = test.drop(columns=['outcome'])
#Seperate data for modeling X_train = df.drop('outcome', axis=1) X_test = test.drop('outcome', axis=1) Y_train = df['outcome'] Y_test = test['outcome']
In [73]:
Copied!
from sklearn.linear_model import LogisticRegression

# Make the model: logistic regression with the liblinear solver, fitted on
# the training data. fit() returns the estimator itself, so construction and
# fitting are chained.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)

# Predicted outcome for every test row.
Y_pred = model.predict(X_test)
from sklearn.linear_model import LogisticRegression #Make the model model = LogisticRegression(solver = 'liblinear') model.fit(X_train,Y_train) Y_pred = model.predict(X_test)

Result¶

In [75]:
Copied!
# Accuracy: fraction of test rows whose prediction equals the true outcome.
(Y_test == Y_pred).mean()
#Accuracy np.mean(Y_test==Y_pred)
Out[75]:
0.661227829379674

Not very accurate: only about 66% of the test outcomes are predicted correctly.

In [78]:
Copied!
# Try again with only the more relevant columns.
selected_features = [
    'search_conducted',
    'Vehicle Equipment Violation',
    'Driving While Impaired',
    'Safe Movement Violation',
    'Speed Limit Violation',
]
X_train, X_test = df[selected_features], test[selected_features]
Y_train, Y_test = df['outcome'], test['outcome']
#Try with more relavent columns X_train = df[['search_conducted', 'Vehicle Equipment Violation', 'Driving While Impaired', 'Safe Movement Violation', 'Speed Limit Violation']] X_test = test[['search_conducted', 'Vehicle Equipment Violation', 'Driving While Impaired', 'Safe Movement Violation', 'Speed Limit Violation']] Y_train = df['outcome'] Y_test = test['outcome']
In [79]:
Copied!
# Make the model again, this time on the current X_train / Y_train.
# fit() returns the estimator, so construction and fitting are chained.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Accuracy: fraction of test rows whose prediction equals the true outcome.
(Y_test == Y_pred).mean()
#Make the model model = LogisticRegression(solver = 'liblinear') model.fit(X_train,Y_train) Y_pred = model.predict(X_test) #Accuracy np.mean(Y_test==Y_pred)
Out[79]:
0.6585383071749138
Previous Next

Built with MkDocs using a theme provided by Read the Docs.
« Previous Next »